import os
import json
import gzip
from csv  import writer
import csv
import pandas as pd
import numpy as np
import subprocess as sub
import re
from ast import literal_eval
from scipy.spatial import cKDTree
from math import isnan
import sys

# Script to process gzipped, line-delimited JSON tweets collected from the Twitter API.

# Echo the command line (one argument per line) so the run's log shows how
# it was invoked.
cli_args = sys.argv
print("\n".join(cli_args))

# First positional argument: name of this run's progress CSV.  It is later
# joined onto out_path, checked for already-handled files, and appended to.
last_try = cli_args[1]

# Input and output directory prefixes.  They are concatenated directly with
# file names further down, so each one needs its trailing path separator.
in_path1 = ''
in_path2 = ''
out_path = ''

def centeroidpython(data):
    """Return the arithmetic mean of a sequence of (x, y) pairs.

    Args:
        data: non-empty iterable of two-item sequences of numbers.

    Returns:
        A tuple ``(mean_x, mean_y)`` with each mean rounded to 3 decimals.

    Raises:
        ValueError: if ``data`` is empty or an item is not a pair, because
            the ``zip(*data)`` result cannot be unpacked into two sequences.
    """
    xs, ys = zip(*data)
    # float() forces true division: under Python 2, all-integer inputs would
    # otherwise be floor-divided and the mean silently truncated.
    count = float(len(xs))
    return round(sum(xs) / count, 3), round(sum(ys) / count, 3)

# Work list: every directory entry from both input directories, name-sorted.
thefiles = sorted(os.listdir(in_path1) + os.listdir(in_path2))

print('\nNumber of files: ' + str(len(thefiles)) + '\n')

# From here on, relative paths resolve against the output directory.
os.chdir(out_path)

# Name lookup sets, taken from the "x" column of each CSV.
arabic_names = set(pd.read_csv('../arabic_names.csv').x.tolist())
nonarabic_names = set(pd.read_csv('../nonarabic_names.csv').x.tolist())

# Gazetteer table and a KD-tree over its (INTPTLONG, INTPTLAT) pairs; row i
# of `zips` corresponds to point i of the tree.
zips = pd.read_csv(in_path2 + '2010_Gaz_zcta_national.csv')
zip_list = list(zip(zips['INTPTLONG'], zips['INTPTLAT']))
tree = cKDTree(np.array(zip_list))

# ---------------------------------------------------------------------------
# Main pass: for every input file, claim it in this run's progress file,
# parse its tweets, attach the nearest gazetteer entry to each tweet, and
# write two tab-separated extracts into out_path (then gzip them).
# ---------------------------------------------------------------------------

# Output names that are never produced (the original note here reads
# "missing hours").  Built once instead of on every loop iteration.
_SKIPPED_OUTPUTS = frozenset([
    '2015_11_28_20_usgeo_short_withtext_zips.txt.gz',
    '2016_04_12_08_usgeo_short_withtext_zips.txt.gz',
    '2016_04_27_01_usgeo_short_withtext_zips.txt.gz',
    '2015_11_28_21_usgeo_short_withtext_zips.txt.gz',
    '2015_01_14_16_usgeo_short_withtext_zips.txt.gz',
    '2015_01_14_05_usgeo_short_withtext_zips.txt.gz',
    '2015_02_03_03_usgeo_short_withtext_zips.txt.gz',
    '2015_02_07_22_usgeo_short_withtext_zips.txt.gz',
    '2015_02_13_00_usgeo_short_withtext_zips.txt.gz',
    '2015_02_13_19_usgeo_short_withtext_zips.txt.gz',
    '2015_02_13_20_usgeo_short_withtext_zips.txt.gz',
    '2015_02_13_21_usgeo_short_withtext_zips.txt.gz',
    '2015_02_14_03_usgeo_short_withtext_zips.txt.gz',
    '2015_02_13_23_usgeo_short_withtext_zips.txt.gz',
    '2015_02_14_01_usgeo_short_withtext_zips.txt.gz',
    '2015_02_14_04_usgeo_short_withtext_zips.txt.gz',
    '2014_12_03_10_usgeo_short_withtext_zips.txt.gz',
])

# Progress files consulted before taking a file: this run's own file plus
# four fixed names (presumably written by other runs working in parallel --
# the original comment calls this a "restarts/parallelization hack").
_PROGRESS_FILES = (last_try, 'last_try.csv', 'last_try_2.csv',
                   'last_try_3.csv', 'last_try_4.csv')

# Stand-in for tweets without a usable place; mirrors the keys read below.
_NO_PLACE = {u'full_name': np.nan, u'place_type': np.nan,
             u'bounding_box': {u'coordinates': [[[np.nan, np.nan], [np.nan, np.nan],
                                                 [np.nan, np.nan], [np.nan, np.nan]]]}}

# Missing points are replaced by a location farther than the search radius
# from every gazetteer point, so the KD-tree reports "no neighbour" for them
# (infinite distance, index == number of tree points).
_FAR_POINT = (2000, 2000)
_SEARCH_RADIUS = 1000

_FULL_COLUMNS = ['id', 'timestamp', 'tweet_id', 'lang', 'place_name', 'place_type', 'user',
                 'distance_nearest_zip_centroids', 'nearest_zip_centroids',
                 'distance_nearest_zip_coords', 'nearest_zip_coords',
                 'arabic_name', 'nonarabic_name', 'missing_coords',
                 'centroid', 'coord', 'name', 'text']
_SHORT_COLUMNS = ['id', 'timestamp', 'tweet_id', 'lang', 'place_name', 'place_type',
                  'distance_nearest_zip_centroids', 'nearest_zip_centroids',
                  'distance_nearest_zip_coords', 'nearest_zip_coords',
                  'arabic_name', 'nonarabic_name', 'missing_coords']


def _already_claimed(fn):
    """Return True if `fn` appears in the 'last_try' column of any existing progress file."""
    # The files are re-read on every call on purpose: they can grow while
    # this script is running.
    for ledger in _PROGRESS_FILES:
        ledger_path = os.path.abspath(out_path + ledger)
        if os.path.isfile(ledger_path) and fn in list(pd.read_csv(ledger_path)['last_try']):
            return True
    return False


def _record_claim(fn):
    """Append `fn` to this run's progress file, writing the header row first if the file is new."""
    ledger_path = os.path.abspath(out_path + last_try)
    is_new = not os.path.isfile(ledger_path)
    with open(ledger_path, 'a') as f:
        rows = writer(f, delimiter=",")
        if is_new:
            rows.writerow(['last_try'])
        rows.writerow([fn])


def _read_tweets(path):
    """Parse a gzipped file of one JSON document per line.

    Returns (tweets, n_bad): the parsed documents and the number of lines
    that failed to parse.  The file handle is closed before returning.
    """
    parsed, n_bad = [], 0
    with gzip.open(path, 'r') as handle:
        for line in handle:
            try:
                parsed.append(json.loads(line))
            except ValueError:
                n_bad += 1
    return parsed, n_bad


def _user_field(tweet, key):
    """Return tweet['user'][key], or NaN when the tweet has no 'user' entry."""
    return tweet['user'][key] if 'user' in tweet else np.nan


def _tweet_point(tweet):
    """Return the tweet's 'coordinates' pair rounded to 3 decimals, or NaN if absent."""
    geo = tweet.get('coordinates')
    if not hasattr(geo, 'keys'):
        return np.nan
    return (round(geo['coordinates'][0], 3), round(geo['coordinates'][1], 3))


def _tweet_place(tweet):
    """Return the tweet's 'place' mapping, or the NaN-filled stand-in when it has none."""
    place = tweet.get('place')
    return place if hasattr(place, 'keys') else _NO_PLACE


def _place_centroid(place):
    """Return the centroid of the first ring of the place's bounding box, or NaN without a box."""
    box = place.get('bounding_box')
    if box is None:
        return np.nan
    return centeroidpython(box['coordinates'][0])


def _point_or_far(value):
    """Round-trip `value` through its string form; anything that is not a plain literal
    (NaN, or a tuple containing NaN) becomes the far-away sentinel point."""
    try:
        return literal_eval(str(value))
    except (ValueError, SyntaxError):
        return _FAR_POINT


def _nearest_zip(points):
    """Return (distances, GEOIDs) of the nearest gazetteer point for each (x, y) in `points`.

    Points with no neighbour inside the search radius get an infinite
    distance and an empty-string GEOID.
    """
    dists, hits = tree.query(np.array(points), k=1, distance_upper_bound=_SEARCH_RADIUS)
    geoids = [zips.GEOID[i] if i != len(zip_list) else '' for i in hits]
    return dists, geoids


def _write_and_compress(frame, columns, dest):
    """Write the selected columns as a quoted TSV to `dest`, then launch `gzip -f` on it."""
    frame[columns].to_csv(dest, sep='\t', encoding='utf-8', index=False,
                          quoting=csv.QUOTE_NONNUMERIC)
    # NOTE(review): compression is fire-and-forget, as before -- the script
    # neither waits for gzip nor checks its exit status.
    sub.Popen(['gzip', '-f', dest], stdout=sub.PIPE, stderr=sub.PIPE)


print('Getting tweet geos..\n')
for j, fn in enumerate(thefiles, 1):
    # Progress marker: name every 24th file (files 1, 25, 49, ...).
    if j % 24 == 1:
        print('  ' + str(fn) + '')

    out_stem = fn.replace('.txt.gz', '')
    if out_stem + '_short_withtext_zips.txt.gz' in _SKIPPED_OUTPUTS:
        continue

    # Skip files already listed in a progress file, then list this one
    # *before* processing it, so a file that crashes the script is not
    # retried on restart.
    if _already_claimed(fn):
        continue
    _record_claim(fn)

    # Start from an empty list for every file so that a name found in neither
    # input directory can no longer reuse the previous file's tweets.  If the
    # name exists in both directories, the second directory's content wins.
    # NOTE(review): a non-gzip entry in the work list still raises here.
    tweets = []
    nexceptions = 0
    for in_dir in (in_path1, in_path2):
        src = os.path.abspath(in_dir + fn)
        if os.path.isfile(src):
            tweets, n_bad = _read_tweets(src)
            nexceptions += n_bad

    print('      # of read line exceptions: ' + str(nexceptions) + ' (hour ' + fn[11:13] + ')')

    if not tweets:
        # Nothing to index: the KD-tree query below cannot take an empty array.
        print('      no tweets read, skipping ' + str(fn))
        continue

    places = [_tweet_place(tweet) for tweet in tweets]
    DF = pd.DataFrame({
        'user': [_user_field(tweet, 'screen_name') for tweet in tweets],
        'id': [_user_field(tweet, 'id_str') for tweet in tweets],
        'name': [_user_field(tweet, 'name') for tweet in tweets],
        'lang': [_user_field(tweet, 'lang') for tweet in tweets],
        'tweet_id': [tweet.get('id_str', np.nan) for tweet in tweets],
        'timestamp': [tweet.get('timestamp_ms', np.nan) for tweet in tweets],
        'coord': [_tweet_point(tweet) for tweet in tweets],
        'text': [tweet.get('text', np.nan) for tweet in tweets],
        'place_type': [place['place_type'] for place in places],
        'place_name': [place['full_name'] for place in places],
        'centroid': [_place_centroid(place) for place in places],
    })

    # Drop every non-ASCII character from the free-text columns.
    for text_column in ('name', 'place_name', 'text'):
        DF[text_column] = DF[text_column].str.replace('[^\x00-\x7F]', '')

    # Nearest gazetteer entry to the place centroid and to the exact point.
    dists, geoids = _nearest_zip([_point_or_far(c) for c in DF['centroid']])
    DF['distance_nearest_zip_centroids'] = dists
    DF['nearest_zip_centroids'] = geoids

    dists, geoids = _nearest_zip([_point_or_far(c) for c in DF['coord']])
    DF['distance_nearest_zip_coords'] = dists
    DF['nearest_zip_coords'] = geoids

    # An infinite distance means the tweet had no usable exact coordinates.
    DF['missing_coords'] = DF['distance_nearest_zip_coords'].apply(np.isinf)

    # Flag tweets whose lower-cased first name token is in either name list.
    first_names = [full_name.split(" ")[0].lower() for full_name in DF['name'].astype(str)]
    DF['arabic_name'] = [first in arabic_names for first in first_names]
    DF['nonarabic_name'] = [first in nonarabic_names for first in first_names]

    _write_and_compress(DF, _FULL_COLUMNS,
                        os.path.abspath(out_path + out_stem + '_short_withtext_zips.txt'))
    _write_and_compress(DF, _SHORT_COLUMNS,
                        os.path.abspath(out_path + out_stem + '_short_zips.txt'))


print('\nDone (EOF).\n\n')
